import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import make_moons
# Build a noisy two-moons toy dataset and wrap the features in a DataFrame
# so the pandas plotting API can be used.
X, y = make_moons(n_samples=200, shuffle=True, noise=0.5, random_state=10)
X = pd.DataFrame(X, columns=["feature 1", "feature 2"])

# Scatter the two features, colouring each point by its class label.
ax = X.plot.scatter(
    x="feature 1",
    y="feature 2",
    c=y,
    colormap="viridis",
    colorbar=False,
)
sns.despine(ax=ax, offset=10)
plt.tight_layout()
imbalanced data
imbalanced data
ref
https://imbalanced-learn.org/stable/references/index.html#api
https://towardsdatascience.com/imbalanced-classification-in-python-smote-tomek-links-method-6e48dfe69bbc
https://www.kaggle.com/code/rafjaa/resampling-strategies-for-imbalanced-datasets/notebook
imbalanced data
Generate the dataset
Make a dataset imbalanced
# pip install imblearn
from collections import Counter
def ratio_func(y, multiplier, minority_class):
    """Return the target sample count for the minority class.

    Parameters
    ----------
    y : iterable of hashable
        Class labels of the dataset.
    multiplier : float
        Fraction of the current minority-class count to keep.
    minority_class : hashable
        Label whose count is scaled.

    Returns
    -------
    dict
        ``{minority_class: int(multiplier * count)}`` where ``count`` is the
        number of occurrences of ``minority_class`` in ``y`` (0 if absent).
    """
    target_stats = Counter(y)
    return {minority_class: int(multiplier * target_stats[minority_class])}
from imblearn.datasets import make_imbalance
# 2x3 grid: the first panel shows the original data, the remaining five show
# the dataset after shrinking class 1 by each multiplier.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

X.plot.scatter(
    x="feature 1",
    y="feature 2",
    c=y,
    ax=axs[0, 0],
    colormap="viridis",
    colorbar=False,
)
axs[0, 0].set_title("Original set")
sns.despine(ax=axs[0, 0], offset=10)

multipliers = [0.9, 0.75, 0.5, 0.25, 0.1]
for ax, multiplier in zip(axs.ravel()[1:], multipliers):
    # Extra keyword arguments are forwarded to the sampling_strategy callable.
    X_resampled, y_resampled = make_imbalance(
        X,
        y,
        sampling_strategy=ratio_func,
        multiplier=multiplier,
        minority_class=1,
    )
    X_resampled.plot.scatter(
        x="feature 1",
        y="feature 2",
        c=y_resampled,
        ax=ax,
        colormap="viridis",
        colorbar=False,
    )
    ax.set_title(f"Sampling ratio = {multiplier}")
    sns.despine(ax=ax, offset=10)

plt.tight_layout()
plt.show()
over-sampling
Effect of the shrinkage factor in random over-sampling
from collections import Counter
from sklearn.datasets import make_classification
# Two-feature binary problem with class weights 0.1 / 0.9.
X, y = make_classification(
    n_samples=100,
    n_features=2,
    n_redundant=0,
    weights=[0.1, 0.9],
    random_state=0,
)
# Notebook-style display of the class distribution.
Counter(y)
Counter({1: 90, 0: 10})
import matplotlib.pyplot as plt
# Scatter the original samples, coloured by class, with a class legend.
fig, ax = plt.subplots(figsize=(7, 7))
scatter = plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.4)
class_legend = ax.legend(
    *scatter.legend_elements(), loc="lower left", title="Classes"
)
ax.add_artist(class_legend)
ax.set_xlabel("Feature #1")
_ = ax.set_ylabel("Feature #2")  # bind to `_` to suppress notebook echo
plt.tight_layout()
from imblearn.over_sampling import RandomOverSampler
# Random over-sampling with default settings (no shrinkage argument).
sampler = RandomOverSampler(random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
# Notebook-style display of the resampled class distribution.
Counter(y_res)
Counter({1: 90, 0: 90})
- 부트스트랩 사용
# Scatter the resampled data produced by the default over-sampler.
fig, ax = plt.subplots(figsize=(7, 7))
scatter = plt.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.4)
class_legend = ax.legend(
    *scatter.legend_elements(), loc="lower left", title="Classes"
)
ax.add_artist(class_legend)
ax.set_xlabel("Feature #1")
_ = ax.set_ylabel("Feature #2")  # bind to `_` to suppress notebook echo
plt.tight_layout()
# Random over-sampling with shrinkage=1.
sampler = RandomOverSampler(shrinkage=1, random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
# Notebook-style display of the resampled class distribution.
Counter(y_res)
Counter({1: 90, 0: 90})
# Scatter the data resampled with shrinkage=1.
fig, ax = plt.subplots(figsize=(7, 7))
scatter = plt.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.4)
class_legend = ax.legend(
    *scatter.legend_elements(), loc="lower left", title="Classes"
)
ax.add_artist(class_legend)
ax.set_xlabel("Feature #1")
_ = ax.set_ylabel("Feature #2")  # bind to `_` to suppress notebook echo
plt.tight_layout()
# Random over-sampling with shrinkage=3.
sampler = RandomOverSampler(shrinkage=3, random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
# Notebook-style display of the resampled class distribution.
Counter(y_res)
Counter({1: 90, 0: 90})
# Scatter the data resampled with shrinkage=3.
fig, ax = plt.subplots(figsize=(7, 7))
scatter = plt.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.4)
class_legend = ax.legend(
    *scatter.legend_elements(), loc="lower left", title="Classes"
)
ax.add_artist(class_legend)
ax.set_xlabel("Feature #1")
_ = ax.set_ylabel("Feature #2")  # bind to `_` to suppress notebook echo
plt.tight_layout()
# Random over-sampling with shrinkage=0.
sampler = RandomOverSampler(shrinkage=0, random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
# Notebook-style display of the resampled class distribution.
Counter(y_res)
Counter({1: 90, 0: 90})
# Scatter the data resampled with shrinkage=0.
fig, ax = plt.subplots(figsize=(7, 7))
scatter = plt.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.4)
class_legend = ax.legend(
    *scatter.legend_elements(), loc="lower left", title="Classes"
)
ax.add_artist(class_legend)
ax.set_xlabel("Feature #1")
_ = ax.set_ylabel("Feature #2")  # bind to `_` to suppress notebook echo
plt.tight_layout()
Sample generator used in SMOTE-like samplers
# Prints the module docstring (``None`` when the module has none).
print(__doc__)

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

sns.set_context("poster")

# Fixed seed so the picked pair and the interpolation step are reproducible.
rng = np.random.RandomState(18)

f, ax = plt.subplots(figsize=(8, 8))

# generate some data points
y = np.array([3.65284, 3.52623, 3.51468, 3.22199, 3.21])
z = np.array([0.43, 0.45, 0.6, 0.4, 0.211])
y_2 = np.array([3.3, 3.6])
z_2 = np.array([0.58, 0.34])

# plot the majority and minority samples
ax.scatter(z, y, label="Minority class", s=100)
ax.scatter(z_2, y_2, label="Majority class", s=100)

# pick two minority samples at random and annotate them
idx = rng.randint(len(y), size=2)
annotation = [r"$x_i$", r"$x_{zi}$"]

for a, i in zip(annotation, idx):
    ax.annotate(a, (z[i], y[i]), xytext=(z[i] + 0.01, y[i] + 0.005), fontsize=15)

# draw the circle in which the new sample will be generated
radius = np.sqrt((z[idx[0]] - z[idx[1]]) ** 2 + (y[idx[0]] - y[idx[1]]) ** 2)
circle = plt.Circle((z[idx[0]], y[idx[0]]), radius=radius, alpha=0.2)
ax.add_artist(circle)

# plot the line on which the sample will be generated
ax.plot(z[idx], y[idx], "--", alpha=0.5)

# create and plot the new sample: linear interpolation between the two
# picked points, at a uniformly drawn fraction `step`
step = rng.uniform()
y_gen = y[idx[0]] + step * (y[idx[1]] - y[idx[0]])
z_gen = z[idx[0]] + step * (z[idx[1]] - z[idx[0]])

ax.scatter(z_gen, y_gen, s=100)
ax.annotate(
    r"$x_{new}$",
    (z_gen, y_gen),
    xytext=(z_gen + 0.01, y_gen + 0.005),
    fontsize=15,
)

# make the plot nicer with legend and label
sns.despine(ax=ax, offset=10)
ax.set_xlim([0.2, 0.7])
ax.set_ylim([3.2, 3.7])
plt.xlabel(r"$X_1$")
plt.ylabel(r"$X_2$")
plt.legend()
plt.tight_layout()
plt.show()
Automatically created module for IPython interactive environment
under-sampling
Illustration of the definition of a Tomek link
# Prints the module docstring (``None`` when the module has none).
print(__doc__)

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "poster" plotting context for the figures below.
sns.set_context("poster")
Automatically created module for IPython interactive environment
def make_plot_despine(ax):
    """Apply the shared axis styling to ``ax``.

    Despines the axes with a 10-point offset, fixes both axis limits to
    [0, 3], sets the X_1 / X_2 axis labels and places the legend in the
    lower-right corner.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to style in place; it must already hold labelled artists for
        the legend to show entries.
    """
    sns.despine(ax=ax, offset=10)
    ax.set_xlim([0, 3])
    ax.set_ylim([0, 3])
    ax.set_xlabel(r"$X_1$")
    ax.set_ylabel(r"$X_2$")
    ax.legend(loc="lower right")
import numpy as np
rng = np.random.RandomState(18)

# Hand-picked 2-D toy points; transposing turns the two coordinate lists
# into an (n_samples, 2) array.
X_minority = np.transpose(
    [[1.1, 1.3, 1.15, 0.8, 0.55, 2.1], [1.0, 1.5, 1.7, 2.5, 0.55, 1.9]]
)
X_majority = np.transpose(
    [
        [2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45],
        [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9],
    ]
)

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(
    X_minority[:, 0],
    X_minority[:, 1],
    label="Minority class",
    s=200,
    marker="_",
)
ax.scatter(
    X_majority[:, 0],
    X_majority[:, 1],
    label="Majority class",
    s=200,
    marker="+",
)

# highlight the samples of interest: the last minority point and the
# second majority point
ax.scatter(
    [X_minority[-1, 0], X_majority[1, 0]],
    [X_minority[-1, 1], X_majority[1, 1]],
    label="Tomek link",
    s=200,
    alpha=0.3,
)
make_plot_despine(ax)
fig.suptitle("Illustration of a Tomek link")
fig.tight_layout()
from imblearn.under_sampling import TomekLinks
# Side-by-side comparison of the two TomekLinks sampling strategies.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

samplers = {
    "Removing only majority samples": TomekLinks(sampling_strategy="auto"),
    "Removing all samples": TomekLinks(sampling_strategy="all"),
}

# The input is identical for both samplers, so build it once:
# label 0 for the minority points, label 1 for the majority points.
X_all = np.vstack((X_minority, X_majority))
y_all = np.array([0] * X_minority.shape[0] + [1] * X_majority.shape[0])

for ax, (title, sampler) in zip(axs, samplers.items()):
    X_res, y_res = sampler.fit_resample(X_all, y_all)
    ax.scatter(
        X_res[y_res == 0][:, 0],
        X_res[y_res == 0][:, 1],
        label="Minority class",
        s=200,
        marker="_",
    )
    ax.scatter(
        X_res[y_res == 1][:, 0],
        X_res[y_res == 1][:, 1],
        label="Majority class",
        s=200,
        marker="+",
    )

    # highlight the samples of interest (taken from the original data, so
    # the marker stays visible even when a point was removed)
    ax.scatter(
        [X_minority[-1, 0], X_majority[1, 0]],
        [X_minority[-1, 1], X_majority[1, 1]],
        label="Tomek link",
        s=200,
        alpha=0.3,
    )
    ax.set_title(title)
    make_plot_despine(ax)
fig.tight_layout()

plt.show()
Sample selection in NearMiss
# Prints the module docstring (``None`` when the module has none).
print(__doc__)

import seaborn as sns

# Use seaborn's "poster" plotting context for the figures below.
sns.set_context("poster")
Automatically created module for IPython interactive environment
def make_plot_despine(ax):
    """Apply the shared axis styling to ``ax``.

    Despines the axes with a 10-point offset, fixes both axis limits to
    [0, 3.5] with ticks every 0.5, sets the X_1 / X_2 axis labels and
    places a font-size-16 legend in the upper-left corner.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to style in place; it must already hold labelled artists for
        the legend to show entries.
    """
    sns.despine(ax=ax, offset=10)
    ax.set_xlim([0, 3.5])
    ax.set_ylim([0, 3.5])
    # Stop value 3.6 so that the 3.5 tick is included.
    ax.set_xticks(np.arange(0, 3.6, 0.5))
    ax.set_yticks(np.arange(0, 3.6, 0.5))
    ax.set_xlabel(r"$X_1$")
    ax.set_ylabel(r"$X_2$")
    ax.legend(loc="upper left", fontsize=16)
import numpy as np
rng = np.random.RandomState(18)

# Hand-picked 2-D toy points; transposing turns the two coordinate lists
# into an (n_samples, 2) array.
X_minority = np.transpose(
    [[1.1, 1.3, 1.15, 0.8, 0.8, 0.6, 0.55], [1.0, 1.5, 1.7, 2.5, 2.0, 1.2, 0.55]]
)
X_majority = np.transpose(
    [
        [2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45],
        [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9],
    ]
)
NearMiss-1
가장 가까운 이웃의 평균 거리가 가장 작은 다수 클래스에서 표본 선택
3-NN을 사용하여 특정 샘플 2개에 대한 평균 거리 계산
평균거리가 더 작아 녹색 점선으로 연결된 점 선택
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(
    X_minority[:, 0],
    X_minority[:, 1],
    label="Minority class",
    s=200,
    marker="_",
)
ax.scatter(
    X_majority[:, 0],
    X_majority[:, 1],
    label="Majority class",
    s=200,
    marker="+",
)

# For the first two majority samples, find their 3 nearest minority
# neighbours and average the distances.
nearest_neighbors = NearestNeighbors(n_neighbors=3)
nearest_neighbors.fit(X_minority)
dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :])
dist_avg = dist.sum(axis=1) / 3

# Draw dashed links from each majority sample to its neighbours: green for
# the first sample, red for the second.
for positive_idx, (neighbors, distance, color) in enumerate(
    zip(ind, dist_avg, ["g", "r"])
):
    for make_plot, sample_idx in enumerate(neighbors):
        ax.plot(
            [X_majority[positive_idx, 0], X_minority[sample_idx, 0]],
            [X_majority[positive_idx, 1], X_minority[sample_idx, 1]],
            "--" + color,
            alpha=0.3,
            # label only the first link so the legend has one entry per sample
            label=f"Avg. dist.={distance:.2f}" if make_plot == 0 else "",
        )
ax.set_title("NearMiss-1")
make_plot_despine(ax)
plt.tight_layout()
NearMiss-2
가장 먼 이웃의 평균 거리가 가장 작은 샘플 선택
멀리 있는 세 이웃의 거리가 가장 작은 녹색 점 선택
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(
    X_minority[:, 0],
    X_minority[:, 1],
    label="Minority class",
    s=200,
    marker="_",
)
ax.scatter(
    X_majority[:, 0],
    X_majority[:, 1],
    label="Majority class",
    s=200,
    marker="+",
)

# Query all minority samples as neighbours, then keep the last three
# columns, i.e. the three farthest ones (kneighbors returns them sorted by
# increasing distance).
nearest_neighbors = NearestNeighbors(n_neighbors=X_minority.shape[0])
nearest_neighbors.fit(X_minority)
dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :])
dist = dist[:, -3::]
ind = ind[:, -3::]
dist_avg = dist.sum(axis=1) / 3

# Draw dashed links from each majority sample to those neighbours: green
# for the first sample, red for the second.
for positive_idx, (neighbors, distance, color) in enumerate(
    zip(ind, dist_avg, ["g", "r"])
):
    for make_plot, sample_idx in enumerate(neighbors):
        ax.plot(
            [X_majority[positive_idx, 0], X_minority[sample_idx, 0]],
            [X_majority[positive_idx, 1], X_minority[sample_idx, 1]],
            "--" + color,
            alpha=0.3,
            # label only the first link so the legend has one entry per sample
            label=f"Avg. dist.={distance:.2f}" if make_plot == 0 else "",
        )
ax.set_title("NearMiss-2")
make_plot_despine(ax)
plt.tight_layout()
NearMiss-3
가장 가까운 이웃은 다수 클래스의 샘플을 short-list 하는데 사용
가장 가까운 이웃의 평균 거리가 가장 큰 표본 선택
fig, ax = plt.subplots(figsize=(8.5, 8.5))
ax.scatter(
    X_minority[:, 0],
    X_minority[:, 1],
    label="Minority class",
    s=200,
    marker="_",
)
ax.scatter(
    X_majority[:, 0],
    X_majority[:, 1],
    label="Majority class",
    s=200,
    marker="+",
)

nearest_neighbors = NearestNeighbors(n_neighbors=3)
nearest_neighbors.fit(X_majority)

# select only the majority points of interest: those that are among the 3
# nearest majority neighbours of at least one minority sample.
# NOTE: this rebinds X_majority to the short-listed subset.
selected_idx = nearest_neighbors.kneighbors(X_minority, return_distance=False)
X_majority = X_majority[np.unique(selected_idx), :]
ax.scatter(
    X_majority[:, 0],
    X_majority[:, 1],
    label="Short-listed samples",
    s=200,
    alpha=0.3,
    color="g",
)

# For the first two short-listed samples, average the distance to their 3
# nearest minority neighbours.
nearest_neighbors = NearestNeighbors(n_neighbors=3)
nearest_neighbors.fit(X_minority)
dist, ind = nearest_neighbors.kneighbors(X_majority[:2, :])
dist_avg = dist.sum(axis=1) / 3

# Draw dashed links: red for the first sample, green for the second.
for positive_idx, (neighbors, distance, color) in enumerate(
    zip(ind, dist_avg, ["r", "g"])
):
    for make_plot, sample_idx in enumerate(neighbors):
        ax.plot(
            [X_majority[positive_idx, 0], X_minority[sample_idx, 0]],
            [X_majority[positive_idx, 1], X_minority[sample_idx, 1]],
            "--" + color,
            alpha=0.3,
            # label only the first link so the legend has one entry per sample
            label=f"Avg. dist.={distance:.2f}" if make_plot == 0 else "",
        )
ax.set_title("NearMiss-3")
make_plot_despine(ax)
plt.tight_layout()
plt.show()